Introduction

Row

Correlation of CMR genes with mean GC content

Statistics of linear regression of CMR genes with mean GC content

Stats values
r.squared 0.825525461273074
adj.r.squared 0.825502588365532
fstatistic 36091.846205977 1 7628
sigma 5.1021475465549
df 2 7628 2

Anova analysis of CMR genes with mean GC content

group Residuals
Df 1.00 7628.00000
Sum Sq 939539.68 198571.40633
Mean Sq 939539.68 26.03191
F value 36091.85 NA
Pr(>F) 0.00 NA

Row

GC content difference between CMR genes and non-CMR genes

The statistics of GC content difference between CMR genes and non-CMR genes

Stats CMR nonCMR
Min 0.3542099 0.2904040
First quantile 0.4199185 0.4338184
Median 0.4392201 0.5102994
Third quantile 0.4650664 0.6229839
Max 0.5327107 0.8321995

GC content density difference between CMR genes and non-CMR genes

---
title: Genomic Feature Analysis of CMR
output: 
  flexdashboard::flex_dashboard:
    orientation: rows
    social: menu
    source_code: embed
    theme: cosmo
---

```{r setup, include=FALSE}
library(flexdashboard)
library(rtracklayer)
library(plotly)
library(ggplot2)
library(seqinr)
library(GenomicFeatures)
library(knitr)
library(kableExtra)
options(scipen = 20)
knitr::opts_chunk$set(echo = TRUE,
                      warning = FALSE,
                      message = FALSE)
```


```{r echo = FALSE, warning = FALSE, message = FALSE}
# gene GC content
getGCContent <- function(inputSeq){
  res <- unlist(lapply(inputSeq, function(x) GC(s2c(as.character(x)))))
  res
}

CMRGene <- read.table(file = CMRGeneDir, sep = "\t", header = F, quote = '',
                      stringsAsFactors = F)
CMRGene <- CMRGene$V1
GTF <- import(con = GTFDir)
GTF <- subset(GTF, GTF$gene_biotype == "protein_coding")
txdb <- makeTxDbFromGFF(file = GTFDir)
GTFGene <- data.frame(subset(GTF, GTF$type == "gene"))
rownames(GTFGene) <- GTFGene$gene_id
GTFdfGeneBed <- data.frame(GTFGene$seqnames,
                           GTFGene$start-1,
                           GTFGene$end,
                           GTFGene$gene_id,
                           ".",
                           GTFGene$strand)
write.table(GTFdfGeneBed, file = paste0(getwd(), "/gene.bed"),
            sep = '\t', quote = F, row.names = F, col.names = F)
cmd <- paste0('bedtools getfasta -s -name -bed ',  paste0(getwd(), "/gene.bed "),
              " -fi ", genome, " -fo ", paste0(getwd(), "/gene.fasta"))
system(command = cmd)
geneSeq <- read.fasta(paste0(getwd(), "/gene.fasta"), as.string = T)
names(geneSeq) <- gsub(pattern = "\\(|\\+|\\-|\\)", replacement = "", x = names(geneSeq))
geneSeq <- lapply(geneSeq, as.character)

seqNames <- na.omit(as.numeric(as.character(unique(seqnames(GTF)@values))))

GTFdf <- data.frame(GTF)

resFreq <- NULL
resGCContent <- NULL
for(i in 1:length(seqNames)){
  curGTF <- subset(GTFdf, GTFdf$seqnames == i)
  curGeneGTF <- subset(curGTF, curGTF$type == "gene")
  curGeneGTF <- curGeneGTF[order(curGeneGTF$start), ]
  rownames(curGeneGTF) <- curGeneGTF$gene_id
  orderGene <- rownames(curGeneGTF)

  curLength <- length(unique(curGTF$gene_id))
  curStart <- seq(1, curLength - seqLength + stepSize, stepSize)
  curEnd <- c(curStart[1:(length(curStart)-1)] + seqLength - 1, curLength)
  curMat <- cbind(curStart, curEnd)
  CMRFreq <- apply(curMat, 1,
                   function(x) length(intersect(orderGene[x[1]:x[2]],
                                                CMRGene))/seqLength)
  curGCContent <- unlist(apply(curMat, 1,
                               function(x) mean(getGCContent(geneSeq[orderGene[x[1]:x[2]]]))))
  resFreq <- c(resFreq, CMRFreq)
  resGCContent <- c(resGCContent, curGCContent)
}
```


### Introduction
- The function provides the correlation analysis of CMR genes with multiple gene features including **Mean gene length**, **Mean exon number**, **Mean GC content** and **Mean distance to adjacent gene**. To be specific, we split the maize genome in a sliding window of 100 adjacent genes with step size of 10, and calculate the frequency of CMR genes in each window (100 adjacent genes).


Row {data-height=250}
---------------------------------------------------------------

### Correlation of CMR genes with mean gene length

```{r echo = FALSE, warning=FALSE, message=FALSE}
df <- data.frame(freq = resFreq*100, 
                 GCContent = resGCContent*100)
p1 <- ggplot(df, aes(x = freq, y = GCContent)) + 
      geom_point(colour = "grey") + 
      geom_smooth(method = "lm", se = FALSE, colour = "red") + 
      labs(x = "Frequency of CMR genes (%)", y = "Mean GC content")
ggplotly(p1)
```



### Statistics of linear regression of CMR genes with mean GC content

```{r echo = FALSE, warning = FALSE, message = FALSE}
group <- gl(n = 2, k = nrow(df), labels = c("freq", "GCContent"))
values <- c(df$freq, df$GCContent)
lmRes <- lm(values ~ group)
res.summary <- summary(lmRes)
res.table <- data.frame(Stats = c("r.squared",
                                  "adj.r.squared",
                                  "fstatistic",
                                  "sigma",
                                  "df"),
                        values = c(res.summary$r.squared,
                                   res.summary$adj.r.squared,
                                   paste(res.summary$fstatistic, collapse = " "),
                                   res.summary$sigma,
                                   paste(res.summary$df, collapse = " ")))
knitr::kable(x = res.table, align = 'c') %>%  
  kableExtra::kable_styling(position = 'center', full_width = FALSE, 
                            stripe_color = 'black', latex_options = "bordered")
```


### Anova analysis of CMR genes with mean GC content

```{r echo = FALSE, warning = FALSE, message = FALSE}
knitr::kable(x = t(anova(lmRes)), align = "c") %>%
  kableExtra::kable_styling(position = "center", full_width = FALSE,
                            stripe_color = "black", latex_options = "bordered")
```





Row {data-height=250}
---------------------------------------------------------------

```{r echo = FALSE, warning = FALSE, message = FALSE}
nonCMRGene <- setdiff(unique(GTFdf$gene_id), CMRGene)
nonCMRGene <- sample(nonCMRGene, size = ratio*length(CMRGene))
```

### GC content difference between CMR genes and non-CMR genes
```{r echo = FALSE, warning=FALSE, message=FALSE}
cmr.GCContent <- as.numeric(getGCContent(geneSeq[CMRGene]))
noncmr.GCContent <- as.numeric(getGCContent(geneSeq[nonCMRGene]))

cmrStat <- boxplot.stats(cmr.GCContent)$stats
tmpCMR <- cmr.GCContent[which(cmr.GCContent <= cmrStat[5])]

noncmrStat <- boxplot.stats(noncmr.GCContent)$stats
tmpNonCMR <- noncmr.GCContent[which(noncmr.GCContent <= noncmrStat[5])]

df <- data.frame(type = factor(rep(c("CMRGene", "nonCMRGene"),
                                   c(length(tmpCMR), length(tmpNonCMR)))),
                 value = c(tmpCMR*100, tmpNonCMR*100))

p <- ggplot(df, aes(x=type, y=value, fill=type)) + 
  geom_violin(trim = FALSE) +
  theme(axis.title.x = element_blank(),
        #axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.title = element_blank(),
        legend.position = "none")
ggplotly(p)
```

### The statistics of GC content difference between CMR genes and non-CMR genes
```{r echo = FALSE, warning=FALSE, message=FALSE}
res <- data.frame(Stats = c("Min", "First quantile", "Median", "Third quantile", "Max"),
                  CMR = cmrStat, nonCMR = noncmrStat)
knitr::kable(x = res, align = 'c') %>%  
  kableExtra::kable_styling(position = 'center', full_width = FALSE, 
                            stripe_color = 'black', latex_options = "bordered")
```




### GC content density difference between CMR genes and non-CMR genes
```{r echo = FALSE, warning = FALSE, message = FALSE}
p <- ggplot(df, aes(x=value, fill=type)) + 
  geom_density(alpha = 0.5) +
  theme(axis.title.x = element_blank(),
        #axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.title = element_blank(),
        legend.position = "none")
ggplotly(p)
```